data_test <- read.csv("https://raw.githubusercontent.com/salma71/Data_621/master/HW_1/datasets/moneyball-evaluation-data.csv", header = TRUE) %>%select(-INDEX)
data_train <- read.csv("https://raw.githubusercontent.com/salma71/Data_621/master/HW_1/datasets/moneyball-training-data.csv", header = TRUE) %>% select(-INDEX)[1] 2276 16
[1] 259 15
TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
"integer" "integer" "integer" "integer"
TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
"integer" "integer" "integer" "integer"
TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
"integer" "integer" "integer" "integer"
TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
"integer" "integer" "integer" "integer"
| Name | data_train |
| Number of rows | 2276 |
| Number of columns | 16 |
| _______________________ | |
| Column type frequency: | |
| numeric | 16 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| TARGET_WINS | 0 | 1.00 | 80.79 | 15.75 | 0 | 71.0 | 82.0 | 92.00 | 146 | ▁▁▇▅▁ |
| TEAM_BATTING_H | 0 | 1.00 | 1469.27 | 144.59 | 891 | 1383.0 | 1454.0 | 1537.25 | 2554 | ▁▇▂▁▁ |
| TEAM_BATTING_2B | 0 | 1.00 | 241.25 | 46.80 | 69 | 208.0 | 238.0 | 273.00 | 458 | ▁▆▇▂▁ |
| TEAM_BATTING_3B | 0 | 1.00 | 55.25 | 27.94 | 0 | 34.0 | 47.0 | 72.00 | 223 | ▇▇▂▁▁ |
| TEAM_BATTING_HR | 0 | 1.00 | 99.61 | 60.55 | 0 | 42.0 | 102.0 | 147.00 | 264 | ▇▆▇▅▁ |
| TEAM_BATTING_BB | 0 | 1.00 | 501.56 | 122.67 | 0 | 451.0 | 512.0 | 580.00 | 878 | ▁▁▇▇▁ |
| TEAM_BATTING_SO | 102 | 0.96 | 735.61 | 248.53 | 0 | 548.0 | 750.0 | 930.00 | 1399 | ▁▆▇▇▁ |
| TEAM_BASERUN_SB | 131 | 0.94 | 124.76 | 87.79 | 0 | 66.0 | 101.0 | 156.00 | 697 | ▇▃▁▁▁ |
| TEAM_BASERUN_CS | 772 | 0.66 | 52.80 | 22.96 | 0 | 38.0 | 49.0 | 62.00 | 201 | ▃▇▁▁▁ |
| TEAM_BATTING_HBP | 2085 | 0.08 | 59.36 | 12.97 | 29 | 50.5 | 58.0 | 67.00 | 95 | ▂▇▇▅▁ |
| TEAM_PITCHING_H | 0 | 1.00 | 1779.21 | 1406.84 | 1137 | 1419.0 | 1518.0 | 1682.50 | 30132 | ▇▁▁▁▁ |
| TEAM_PITCHING_HR | 0 | 1.00 | 105.70 | 61.30 | 0 | 50.0 | 107.0 | 150.00 | 343 | ▇▇▆▁▁ |
| TEAM_PITCHING_BB | 0 | 1.00 | 553.01 | 166.36 | 0 | 476.0 | 536.5 | 611.00 | 3645 | ▇▁▁▁▁ |
| TEAM_PITCHING_SO | 102 | 0.96 | 817.73 | 553.09 | 0 | 615.0 | 813.5 | 968.00 | 19278 | ▇▁▁▁▁ |
| TEAM_FIELDING_E | 0 | 1.00 | 246.48 | 227.77 | 65 | 127.0 | 159.0 | 249.25 | 1898 | ▇▁▁▁▁ |
| TEAM_FIELDING_DP | 286 | 0.87 | 146.39 | 26.23 | 52 | 131.0 | 149.0 | 164.00 | 228 | ▁▂▇▆▁ |
| Name | data_test |
| Number of rows | 259 |
| Number of columns | 15 |
| _______________________ | |
| Column type frequency: | |
| numeric | 15 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| TEAM_BATTING_H | 0 | 1.00 | 1469.39 | 150.66 | 819 | 1387.0 | 1455.0 | 1548.00 | 2170 | ▁▂▇▁▁ |
| TEAM_BATTING_2B | 0 | 1.00 | 241.32 | 49.52 | 44 | 210.0 | 239.0 | 278.50 | 376 | ▁▂▇▇▂ |
| TEAM_BATTING_3B | 0 | 1.00 | 55.91 | 27.14 | 14 | 35.0 | 52.0 | 72.00 | 155 | ▇▇▃▁▁ |
| TEAM_BATTING_HR | 0 | 1.00 | 95.63 | 56.33 | 0 | 44.5 | 101.0 | 135.50 | 242 | ▆▅▇▃▁ |
| TEAM_BATTING_BB | 0 | 1.00 | 498.96 | 120.59 | 15 | 436.5 | 509.0 | 565.50 | 792 | ▁▁▅▇▁ |
| TEAM_BATTING_SO | 18 | 0.93 | 709.34 | 243.11 | 0 | 545.0 | 686.0 | 912.00 | 1268 | ▁▃▇▇▂ |
| TEAM_BASERUN_SB | 13 | 0.95 | 123.70 | 93.39 | 0 | 59.0 | 92.0 | 151.75 | 580 | ▇▃▁▁▁ |
| TEAM_BASERUN_CS | 87 | 0.66 | 52.32 | 23.10 | 0 | 38.0 | 49.5 | 63.00 | 154 | ▂▇▃▁▁ |
| TEAM_BATTING_HBP | 240 | 0.07 | 62.37 | 12.71 | 42 | 53.5 | 62.0 | 67.50 | 96 | ▃▇▅▁▁ |
| TEAM_PITCHING_H | 0 | 1.00 | 1813.46 | 1662.91 | 1155 | 1426.5 | 1515.0 | 1681.00 | 22768 | ▇▁▁▁▁ |
| TEAM_PITCHING_HR | 0 | 1.00 | 102.15 | 57.65 | 0 | 52.0 | 104.0 | 142.50 | 336 | ▇▇▆▁▁ |
| TEAM_PITCHING_BB | 0 | 1.00 | 552.42 | 172.95 | 136 | 471.0 | 526.0 | 606.50 | 2008 | ▆▇▁▁▁ |
| TEAM_PITCHING_SO | 18 | 0.93 | 799.67 | 634.31 | 0 | 613.0 | 745.0 | 938.00 | 9963 | ▇▁▁▁▁ |
| TEAM_FIELDING_E | 0 | 1.00 | 249.75 | 230.90 | 73 | 131.0 | 163.0 | 252.00 | 1568 | ▇▁▁▁▁ |
| TEAM_FIELDING_DP | 31 | 0.88 | 146.06 | 25.88 | 69 | 131.0 | 148.0 | 164.00 | 204 | ▁▂▇▇▂ |
data_train %>%
gather(variable, value) %>%
filter(is.na(value)) %>%
group_by(variable) %>%
tally() %>%
mutate(percent = n / nrow(data_train) * 100) %>%
mutate(percent = paste0(round(percent, ifelse(percent < 10, 1, 0)), "%")) %>%
arrange(desc(n)) %>%
rename(`Variable Missing Data` = variable,
`Number of Records` = n,
`Share of Total` = percent) %>%
kable() %>%
kable_styling()| Variable Missing Data | Number of Records | Share of Total |
|---|---|---|
| TEAM_BATTING_HBP | 2085 | 92% |
| TEAM_BASERUN_CS | 772 | 34% |
| TEAM_FIELDING_DP | 286 | 13% |
| TEAM_BASERUN_SB | 131 | 5.8% |
| TEAM_BATTING_SO | 102 | 4.5% |
| TEAM_PITCHING_SO | 102 | 4.5% |
data_test %>%
gather(variable, value) %>%
filter(is.na(value)) %>%
group_by(variable) %>%
tally() %>%
mutate(percent = n / nrow(data_train) * 100) %>%
mutate(percent = paste0(round(percent, ifelse(percent < 10, 1, 0)), "%")) %>%
arrange(desc(n)) %>%
rename(`Variable Missing Data` = variable,
`Number of Records` = n,
`Share of Total` = percent) %>%
kable() %>%
kable_styling()| Variable Missing Data | Number of Records | Share of Total |
|---|---|---|
| TEAM_BATTING_HBP | 240 | 11% |
| TEAM_BASERUN_CS | 87 | 3.8% |
| TEAM_FIELDING_DP | 31 | 1.4% |
| TEAM_BATTING_SO | 18 | 0.8% |
| TEAM_PITCHING_SO | 18 | 0.8% |
| TEAM_BASERUN_SB | 13 | 0.6% |
data_train %>%
gather() %>%
ggplot( aes(x= value)) +
geom_density(fill='pink') +
facet_wrap(~key, scales = 'free')Let’s take a closer look at the TEAM_BASERUN_SB
data_train %>%
ggplot(aes(TEAM_BASERUN_SB)) +
geom_histogram(bins = 50, fill = 'pink') +
geom_vline(aes(xintercept = mean(TEAM_BASERUN_SB, na.rm = T)), col = "red", lty = 2) +
geom_vline(aes(xintercept = median(TEAM_BASERUN_SB, na.rm = T)), col = "blue", lty = 2) +
labs(x = element_blank(),
y = "Count",
title = "Distribution of Stolen Bases",
caption = "* Red line is the mean value and blue is the median") +
theme_classic()ggplot(data_train, aes(x=TARGET_WINS)) +
geom_histogram(aes(y=..density..),binwidth = 3, colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic()Correlations with Response Variable Let’s take a look at how the predictors are correlated with the response variable:
q <- cor(data_train)
ggcorrplot(q, type = "upper", outline.color = "white",
ggtheme = theme_classic,
colors = c("#6D9EC1", "white", "#E46726"),
lab = TRUE, show.legend = FALSE, tl.cex = 8, lab_size = 3) MI: - TEAM_PITCHING_HR (homeruns pitched) is slightly correlated win the number of wins. This is clearly unexpected because pitched homeruns are a negative for the pitching team. - TEAM_PITCHING_SO (strikeouts by pitcher) likely suffers from outliers because we’d expect strikeouts to be a positive thing for the pitching team. There is a large influential point. - TEAM_PITCHING_BB (walks allowed by pitcher) might suffer from outliers. We’d expected a negative correlation since a walk allowed is a negative for the pitching team.
data_train %>%
gather(variable, value, -TARGET_WINS) %>%
ggplot(., aes(value, TARGET_WINS)) +
geom_point(fill = "pink", color="pink") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
facet_wrap(~variable, scales ="free", ncol = 4) +
labs(x = element_blank(), y = "Wins")[1] 0.09550747
data_train <- data_train %>%
mutate(TEAM_BATTING_SO = ifelse(TEAM_BATTING_SO == 0, NA, TEAM_BATTING_SO)) %>%
mutate(TEAM_PITCHING_SO = ifelse(TEAM_PITCHING_SO > 5346, NA, TEAM_PITCHING_SO)) %>%
select(-TEAM_BATTING_HBP)
set.seed(42)
knn <- data_train %>% knnImputation()
impute_me <- is.na(data_train$TEAM_BATTING_SO)
data_train[impute_me,"TEAM_BATTING_SO"] <- knn[impute_me,"TEAM_BATTING_SO"]
impute_me <- is.na(data_train$TEAM_BASERUN_SB)
data_train[impute_me,"TEAM_BASERUN_SB"] <- knn[impute_me,"TEAM_BASERUN_SB"]
impute_me <- is.na(data_train$TEAM_BASERUN_CS)
data_train[impute_me,"TEAM_BASERUN_CS"] <- knn[impute_me,"TEAM_BASERUN_CS"]
impute_me <- is.na(data_train$TEAM_PITCHING_SO)
data_train[impute_me,"TEAM_PITCHING_SO"] <- knn[impute_me,"TEAM_PITCHING_SO"]
impute_me <- is.na(data_train$TEAM_FIELDING_DP)
data_train[impute_me,"TEAM_FIELDING_DP"] <- knn[impute_me,"TEAM_FIELDING_DP"][1] 0
Do the same for the data_test
[1] 0.1047619
data_test <- data_test %>%
mutate(TEAM_BATTING_SO = ifelse(TEAM_BATTING_SO == 0, NA, TEAM_BATTING_SO)) %>%
mutate(TEAM_PITCHING_SO = ifelse(TEAM_PITCHING_SO > 5346, NA, TEAM_PITCHING_SO)) %>%
select(-TEAM_BATTING_HBP)
set.seed(42)
knn <- data_test %>% knnImputation()
impute_me <- is.na(data_test$TEAM_BATTING_SO)
data_test[impute_me,"TEAM_BATTING_SO"] <- knn[impute_me,"TEAM_BATTING_SO"]
impute_me <- is.na(data_test$TEAM_BASERUN_SB)
data_test[impute_me,"TEAM_BASERUN_SB"] <- knn[impute_me,"TEAM_BASERUN_SB"]
impute_me <- is.na(data_test$TEAM_BASERUN_CS)
data_test[impute_me,"TEAM_BASERUN_CS"] <- knn[impute_me,"TEAM_BASERUN_CS"]
impute_me <- is.na(data_test$TEAM_PITCHING_SO)
data_test[impute_me,"TEAM_PITCHING_SO"] <- knn[impute_me,"TEAM_PITCHING_SO"]
impute_me <- is.na(data_test$TEAM_FIELDING_DP)
data_test[impute_me,"TEAM_FIELDING_DP"] <- knn[impute_me,"TEAM_FIELDING_DP"][1] 0
# New variable: TEAM_BATTING_1B
temp_data <- read.csv("https://raw.githubusercontent.com/salma71/Data_621/master/HW_1/datasets/moneyball-training-data.csv", header = TRUE) %>% select(-INDEX)
base_data <- temp_data
mod_data <- base_data %>% mutate(TEAM_BATTING_1B = base_data$TEAM_BATTING_H - select(., TEAM_BATTING_2B:TEAM_BATTING_HR) %>% rowSums(na.rm = FALSE))
head(mod_data)data_transformed <- mod_data
#Log transform TEAM_BASERUN_CS
data_transformed$TEAM_BASERUN_CS_tform <-log(data_transformed$TEAM_BASERUN_CS)
baserun_cs <- ggplot(data_transformed, aes(x=TEAM_BASERUN_CS)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "TEAM_BASERUN_CS")
baserun_cs_tf <- ggplot(data_transformed, aes(x=TEAM_BASERUN_CS_tform)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "Log Transformed")
#Log transform TEAM_BASERUN_SB
data_transformed$TEAM_BASERUN_SB_tform <-log(data_transformed$TEAM_BASERUN_SB)
baserun_sb <- ggplot(data_transformed, aes(x=TEAM_BASERUN_SB)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "TEAM_BASERUN_SB")
baserun_sb_tf <- ggplot(data_transformed, aes(x=TEAM_BASERUN_SB_tform)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "Log Transformed")
#Log transform TEAM_BATTING_3B
data_transformed$TEAM_BATTING_3B_tform <-log(data_transformed$TEAM_BATTING_3B)
batting_3b <- ggplot(data_transformed, aes(x=TEAM_BATTING_3B)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "TEAM_BATTING_3B")
batting_3b_tf <- ggplot(data_transformed, aes(x=TEAM_BATTING_3B_tform)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "Log Transformed")
#BoxCoxtransform TEAM_BATTING_BB
data_transformed$TEAM_BATTING_BB_tform <- BoxCox(data_transformed$TEAM_BATTING_BB, BoxCoxLambda(data_transformed$TEAM_BATTING_BB))
batting_bb <- ggplot(data_transformed, aes(x=TEAM_BATTING_BB)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "TEAM_BATTING_BB")
batting_bb_tf <- ggplot(data_transformed, aes(x=TEAM_BATTING_BB_tform)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "BoxCox Transformed")
#BoxCoxtransform TEAM_BATTING_H
data_transformed$TEAM_BATTING_H_tform <- BoxCox(data_transformed$TEAM_BATTING_H, BoxCoxLambda(data_transformed$TEAM_BATTING_H))
batting_h <- ggplot(data_transformed, aes(x=TEAM_BATTING_H)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "TEAM_BATTING_H")
batting_h_tf <- ggplot(data_transformed, aes(x=TEAM_BATTING_H_tform)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "BoxCox Transformed")
#BoxCoxtransform TEAM_BATTING_1B
data_transformed$TEAM_BATTING_1B_tform <- BoxCox(data_transformed$TEAM_BATTING_1B, BoxCoxLambda(data_transformed$TEAM_BATTING_1B))
batting_1b <- ggplot(data_transformed, aes(x=TEAM_BATTING_1B)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "TEAM_BATTING_1B")
batting_1b_tf <- ggplot(data_transformed, aes(x=TEAM_BATTING_1B_tform)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "BoxCox Transformed")
#BoxCoxtransform TEAM_FIELDING_E
data_transformed$TEAM_FIELDING_E_tform <- BoxCox(data_transformed$TEAM_FIELDING_E, BoxCoxLambda(data_transformed$TEAM_FIELDING_E))
fielding_e <- ggplot(data_transformed, aes(x=TEAM_FIELDING_E)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "TEAM_FIELDING_E")
fielding_e_tf <- ggplot(data_transformed, aes(x=TEAM_FIELDING_E_tform)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "BoxCox Transformed")
#Log transform TEAM_PITCHING_BB
data_transformed$TEAM_PITCHING_BB_tform <-log(data_transformed$TEAM_PITCHING_BB)
pitching_bb <- ggplot(data_transformed, aes(x=TEAM_PITCHING_BB)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "TEAM_PITCHING_BB")
pitching_bb_tf <- ggplot(data_transformed, aes(x=TEAM_PITCHING_BB_tform)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "Log Transformed")
#BoxCoxtransform TEAM_PITCHING_H
data_transformed$TEAM_PITCHING_H_tform <- BoxCox(data_transformed$TEAM_PITCHING_H, BoxCoxLambda(data_transformed$TEAM_PITCHING_H))
pitching_h <- ggplot(data_transformed, aes(x=TEAM_PITCHING_H)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "TEAM_PITCHING_H")
pitching_h_tf <- ggplot(data_transformed, aes(x=TEAM_PITCHING_H_tform)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "BoxCox Transformed")
#Log transform TEAM_PITCHING_SO
data_transformed$TEAM_PITCHING_SO_tform <-log(data_transformed$TEAM_PITCHING_SO)
pitching_so <- ggplot(data_transformed, aes(x=TEAM_PITCHING_SO)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "TEAM_PITCHING_SO")
pitching_so_tf <- ggplot(data_transformed, aes(x=TEAM_PITCHING_SO_tform)) +
geom_histogram(aes(y=..density..), colour="black", fill="red") +
geom_density(alpha=.8, fill="pink") +
theme_classic() + labs(title = "Log Transformed")
plot_grid(baserun_cs, baserun_cs_tf, baserun_sb, baserun_sb_tf,
batting_3b, batting_3b_tf, batting_bb, batting_bb_tf,
batting_h, batting_h_tf, batting_1b, batting_1b_tf,
fielding_e, fielding_e_tf, pitching_bb, pitching_bb_tf,
pitching_h, pitching_h_tf, pitching_so, pitching_so_tf,
ncol = 2)Do the same for the test set
#Test data transformations to match model
temp_test <- read.csv("https://raw.githubusercontent.com/salma71/Data_621/master/HW_1/datasets/moneyball-evaluation-data.csv", header = TRUE) %>%select(-INDEX)
mod_data_test <- temp_test %>% mutate(TEAM_BATTING_1B = TEAM_BATTING_H - select(., TEAM_BATTING_2B:TEAM_BATTING_HR) %>% rowSums(na.rm = FALSE))
test_data_transformed <- mod_data_test
#Log transform TEAM_BASERUN_CS
test_data_transformed$TEAM_BASERUN_CS_tform <-log(test_data_transformed$TEAM_BASERUN_CS)
#Log transform TEAM_BASERUN_SB
test_data_transformed$TEAM_BASERUN_SB_tform <-log(test_data_transformed$TEAM_BASERUN_SB)
#Log transform TEAM_BATTING_3B
test_data_transformed$TEAM_BATTING_3B_tform <-log(test_data_transformed$TEAM_BATTING_3B)
#BoxCoxtransform TEAM_BATTING_BB
test_data_transformed$TEAM_BATTING_BB_tform <- BoxCox(test_data_transformed$TEAM_BATTING_BB, BoxCoxLambda(test_data_transformed$TEAM_BATTING_BB))
#BoxCoxtransform TEAM_BATTING_H
test_data_transformed$TEAM_BATTING_H_tform <- BoxCox(test_data_transformed$TEAM_BATTING_H, BoxCoxLambda(test_data_transformed$TEAM_BATTING_H))
#BoxCoxtransform TEAM_BATTING_1B
test_data_transformed$TEAM_BATTING_1B_tform <- BoxCox(test_data_transformed$TEAM_BATTING_1B, BoxCoxLambda(test_data_transformed$TEAM_BATTING_1B))
#BoxCoxtransform TEAM_FIELDING_E
test_data_transformed$TEAM_FIELDING_E_tform <- BoxCox(test_data_transformed$TEAM_FIELDING_E, BoxCoxLambda(test_data_transformed$TEAM_FIELDING_E))
#Log transform TEAM_PITCHING_BB
test_data_transformed$TEAM_PITCHING_BB_tform <-log(test_data_transformed$TEAM_PITCHING_BB)
#BoxCoxtransform TEAM_PITCHING_H
test_data_transformed$TEAM_PITCHING_H_tform <- BoxCox(test_data_transformed$TEAM_PITCHING_H, BoxCoxLambda(test_data_transformed$TEAM_PITCHING_H))
#Log transform TEAM_PITCHING_SO
test_data_transformed$TEAM_PITCHING_SO_tform <-log(test_data_transformed$TEAM_PITCHING_SO)MI: this passage is just for reference when dealing with leverage point. Can be removed later Outliers & Leverage Points
In summary, an outlier is a point whose standardized residual falls outside the interval from –2 to 2. Recall that a bad leverage point is a leverage point which is also an outlier. Thus, a bad leverage point is a leverage point whose standar- dized residual falls outside the interval from –2 to 2. On the other hand, a good leverage point is a leverage point whose standardized residual falls inside the interval from –2 to 2.
Recall that the rule for simple linear regression for classifying a point as a leverage point is hii > 4/n .
temp_holder <- read.csv("https://raw.githubusercontent.com/salma71/Data_621/master/HW_1/datasets/moneyball-training-data.csv", header = TRUE) %>% select(-INDEX)
add_features <- function(df){
df %>%
mutate(TEAM_BATTING_1B = TEAM_BATTING_H - TEAM_BATTING_2B - TEAM_BATTING_3B - TEAM_BATTING_HR)
}
data_train_mi <- add_features(temp_holder)
# Creating "at bats" variable representing every time a batter steps up to bat
data_train_mi <- data_train_mi %>% mutate(TEAM_BATTING_AB = TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BATTING_HBP + TEAM_BATTING_SO)
# Creating "batting average" variable
data_train_mi <- data_train_mi %>% mutate(TEAM_BATTING_AVG = TEAM_BATTING_H/TEAM_BATTING_AB)
# Creating "on base percentage" representing the proportion of ways to get a base out of total opportunities to hit the ball
data_train_mi <- data_train_mi %>% mutate(TEAM_BATTING_OBP = (TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BATTING_HBP)/(TEAM_BATTING_AB + TEAM_BATTING_BB + TEAM_BATTING_HBP))
# Creating "slugging percentage" which is a weighted sum of hits by number of bases acquired divided by opportunities to hit the ball
data_train_mi <- data_train_mi %>% mutate(TEAM_BATTING_SLG = (TEAM_BATTING_1B + 2*TEAM_BATTING_2B + 3*TEAM_BATTING_3B + 3*TEAM_BATTING_HR)/TEAM_BATTING_AB)Encapsulating the at above into a function. (Use only one of these in final report). Function makes more sense for re-use in prediction
add_advanced_bb_features <- function(df) {
df %>%
mutate(TEAM_BATTING_AB = TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BATTING_HBP + TEAM_BATTING_SO) %>%
mutate(TEAM_BATTING_AVG = TEAM_BATTING_H/TEAM_BATTING_AB) %>%
mutate(TEAM_BATTING_OBP = (TEAM_BATTING_H + TEAM_BATTING_BB + TEAM_BATTING_HBP)/(TEAM_BATTING_AB + TEAM_BATTING_BB + TEAM_BATTING_HBP)) %>%
mutate(TEAM_BATTING_SLG = (TEAM_BATTING_1B + 2*TEAM_BATTING_2B + 3*TEAM_BATTING_3B + 3*TEAM_BATTING_HR)/TEAM_BATTING_AB)
}MI: Too many values are removed. These engineered features rely on TEAM_BATTING_HBP in the calculations but this is the variable with the most missing data. Treatment of this variable will affect downstream results.
Salma
Call:
lm(formula = TARGET_WINS ~ ., data = data_train, na.action = na.omit)
Residuals:
Min 1Q Median 3Q Max
-49.112 -8.463 0.047 8.364 61.603
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.9438573 5.8754934 1.182 0.237396
TEAM_BATTING_H 0.0521644 0.0037624 13.865 < 2e-16 ***
TEAM_BATTING_2B -0.0146400 0.0091385 -1.602 0.109290
TEAM_BATTING_3B 0.0383492 0.0172017 2.229 0.025886 *
TEAM_BATTING_HR 0.0768270 0.0281262 2.732 0.006354 **
TEAM_BATTING_BB 0.0013952 0.0049562 0.281 0.778354
TEAM_BATTING_SO -0.0019862 0.0033961 -0.585 0.558720
TEAM_BASERUN_SB 0.0040365 0.0054119 0.746 0.455838
TEAM_BASERUN_CS 0.0973314 0.0155751 6.249 4.92e-10 ***
TEAM_PITCHING_H -0.0004574 0.0003778 -1.211 0.226086
TEAM_PITCHING_HR 0.0030711 0.0247709 0.124 0.901341
TEAM_PITCHING_BB 0.0108895 0.0032308 3.371 0.000763 ***
TEAM_PITCHING_SO -0.0023276 0.0022387 -1.040 0.298585
TEAM_FIELDING_E -0.0216762 0.0024116 -8.988 < 2e-16 ***
TEAM_FIELDING_DP -0.0961580 0.0139737 -6.881 7.65e-12 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 12.95 on 2261 degrees of freedom
Multiple R-squared: 0.3284, Adjusted R-squared: 0.3242
F-statistic: 78.96 on 14 and 2261 DF, p-value: < 2.2e-16
m2 <- lm(TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HR +TEAM_BATTING_3B +TEAM_BASERUN_CS+TEAM_FIELDING_E+ TEAM_FIELDING_DP, data = data_train)
summary(m2)
Call:
lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HR +
TEAM_BATTING_3B + TEAM_BASERUN_CS + TEAM_FIELDING_E + TEAM_FIELDING_DP,
data = data_train)
Residuals:
Min 1Q Median 3Q Max
-46.717 -8.663 -0.059 8.538 65.935
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.473998 3.915403 1.143 0.253298
TEAM_BATTING_H 0.051169 0.002467 20.741 < 2e-16 ***
TEAM_BATTING_HR 0.073968 0.007638 9.684 < 2e-16 ***
TEAM_BATTING_3B 0.061358 0.016375 3.747 0.000183 ***
TEAM_BASERUN_CS 0.107941 0.012081 8.935 < 2e-16 ***
TEAM_FIELDING_E -0.023632 0.001584 -14.924 < 2e-16 ***
TEAM_FIELDING_DP -0.078465 0.013791 -5.690 1.44e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 13.06 on 2269 degrees of freedom
Multiple R-squared: 0.3144, Adjusted R-squared: 0.3126
F-statistic: 173.4 on 6 and 2269 DF, p-value: < 2.2e-16
full_formula <- "TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + TEAM_BASERUN_SB + TEAM_BASERUN_CS + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + TEAM_FIELDING_E + TEAM_FIELDING_DP + I(TEAM_BATTING_2B^2) + I(TEAM_BATTING_3B^2) + I(TEAM_BATTING_HR^2) + I(TEAM_BATTING_BB^2) + I(TEAM_BATTING_SO^2) + I(TEAM_BASERUN_SB^2) + I(TEAM_BASERUN_CS^2) + I(TEAM_PITCHING_H^2) + I(TEAM_PITCHING_HR^2) + I(TEAM_PITCHING_BB^2) + I(TEAM_PITCHING_SO^2) + I(TEAM_FIELDING_E^2) + I(TEAM_FIELDING_DP^2) + I(TEAM_BATTING_2B^3) + I(TEAM_BATTING_3B^3) + I(TEAM_BATTING_HR^3) + I(TEAM_BATTING_BB^3) + I(TEAM_BATTING_SO^3) + I(TEAM_BASERUN_SB^3) + I(TEAM_BASERUN_CS^3) + I(TEAM_PITCHING_H^3) + I(TEAM_PITCHING_HR^3) + I(TEAM_PITCHING_BB^3) + I(TEAM_PITCHING_SO^3) + I(TEAM_FIELDING_E^3) + I(TEAM_FIELDING_DP^3) + I(TEAM_BATTING_2B^4) + I(TEAM_BATTING_3B^4) + I(TEAM_BATTING_HR^4) + I(TEAM_BATTING_BB^4) + I(TEAM_BATTING_SO^4) + I(TEAM_BASERUN_SB^4) + I(TEAM_BASERUN_CS^4) + I(TEAM_PITCHING_H^4) + I(TEAM_PITCHING_HR^4) + I(TEAM_PITCHING_BB^4) + I(TEAM_PITCHING_SO^4) + I(TEAM_FIELDING_E^4) + I(TEAM_FIELDING_DP^4) "
m3 <- lm(full_formula, data_train)
step_back <- MASS::stepAIC(m3, direction="backward", trace = F)
poly_call <- summary(step_back)$call
step_back <- lm(poly_call[2], data_train)
summary(step_back)
Call:
lm(formula = poly_call[2], data = data_train)
Residuals:
Min 1Q Median 3Q Max
-54.736 -7.452 0.017 7.344 61.619
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.055e+01 5.511e+01 -0.191 0.848198
TEAM_BATTING_2B 1.519e+00 1.809e-01 8.396 < 2e-16 ***
TEAM_BATTING_3B 3.764e-01 1.871e-01 2.012 0.044312 *
TEAM_BATTING_BB 7.492e-01 9.617e-02 7.790 1.02e-14 ***
TEAM_BASERUN_SB 4.464e-02 1.191e-02 3.748 0.000182 ***
TEAM_PITCHING_H 4.189e-02 3.584e-03 11.689 < 2e-16 ***
TEAM_PITCHING_BB -6.975e-02 1.195e-02 -5.836 6.14e-09 ***
TEAM_PITCHING_SO -2.545e-02 4.404e-03 -5.779 8.57e-09 ***
TEAM_FIELDING_E -2.289e-01 2.252e-02 -10.166 < 2e-16 ***
TEAM_FIELDING_DP -3.639e+00 1.629e+00 -2.233 0.025639 *
I(TEAM_BATTING_2B^2) -5.951e-03 7.210e-04 -8.254 2.58e-16 ***
I(TEAM_BATTING_3B^2) -5.323e-03 3.696e-03 -1.440 0.149948
I(TEAM_BATTING_HR^2) -2.369e-03 9.823e-04 -2.412 0.015967 *
I(TEAM_BATTING_BB^2) -2.499e-03 3.194e-04 -7.825 7.78e-15 ***
I(TEAM_BATTING_SO^2) 3.270e-05 8.862e-06 3.690 0.000229 ***
I(TEAM_BASERUN_CS^2) -1.157e-03 8.030e-04 -1.441 0.149837
I(TEAM_PITCHING_H^2) -4.327e-06 4.723e-07 -9.161 < 2e-16 ***
I(TEAM_PITCHING_HR^2) 3.015e-03 7.715e-04 3.909 9.56e-05 ***
I(TEAM_PITCHING_SO^2) 2.245e-06 1.159e-06 1.937 0.052925 .
I(TEAM_FIELDING_E^2) 3.929e-04 5.385e-05 7.296 4.10e-13 ***
I(TEAM_FIELDING_DP^2) 3.887e-02 1.806e-02 2.153 0.031455 *
I(TEAM_BATTING_2B^3) 7.573e-06 9.411e-07 8.047 1.36e-15 ***
I(TEAM_BATTING_3B^3) 4.442e-05 2.803e-05 1.585 0.113173
I(TEAM_BATTING_HR^3) 1.575e-05 7.177e-06 2.194 0.028302 *
I(TEAM_BATTING_BB^3) 3.690e-06 4.879e-07 7.564 5.68e-14 ***
I(TEAM_BATTING_SO^3) -2.345e-08 5.875e-09 -3.992 6.76e-05 ***
I(TEAM_BASERUN_SB^3) -2.227e-07 1.443e-07 -1.543 0.122933
I(TEAM_BASERUN_CS^3) 2.134e-05 9.901e-06 2.156 0.031216 *
I(TEAM_PITCHING_H^3) 1.688e-10 2.696e-11 6.262 4.54e-10 ***
I(TEAM_PITCHING_HR^3) -1.683e-05 4.989e-06 -3.374 0.000753 ***
I(TEAM_PITCHING_BB^3) 1.061e-08 2.997e-09 3.540 0.000408 ***
I(TEAM_FIELDING_E^3) -3.019e-07 4.806e-08 -6.281 4.02e-10 ***
I(TEAM_FIELDING_DP^3) -1.854e-04 8.639e-05 -2.146 0.031983 *
I(TEAM_BATTING_3B^4) -1.265e-07 6.979e-08 -1.813 0.069995 .
I(TEAM_BATTING_HR^4) -2.933e-08 1.495e-08 -1.962 0.049907 *
I(TEAM_BATTING_BB^4) -1.874e-09 2.652e-10 -7.064 2.16e-12 ***
I(TEAM_BASERUN_SB^4) 3.276e-10 2.064e-10 1.587 0.112665
I(TEAM_BASERUN_CS^4) -6.719e-08 3.306e-08 -2.032 0.042243 *
I(TEAM_PITCHING_H^4) -2.130e-15 4.961e-16 -4.295 1.82e-05 ***
I(TEAM_PITCHING_HR^4) 2.649e-08 8.868e-09 2.988 0.002842 **
I(TEAM_PITCHING_BB^4) -1.522e-12 6.952e-13 -2.189 0.028685 *
I(TEAM_FIELDING_E^4) 7.607e-11 1.416e-11 5.371 8.66e-08 ***
I(TEAM_FIELDING_DP^4) 3.253e-07 1.511e-07 2.153 0.031439 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 11.95 on 2233 degrees of freedom
Multiple R-squared: 0.4351, Adjusted R-squared: 0.4245
F-statistic: 40.95 on 42 and 2233 DF, p-value: < 2.2e-16
Using Transformation - Dhairav
#data with all transformed variables
m4_data <- select(data_transformed,-TEAM_BASERUN_CS, -TEAM_BASERUN_SB, -TEAM_BATTING_3B, -TEAM_BATTING_BB, -TEAM_BATTING_H, -TEAM_FIELDING_E, -TEAM_PITCHING_BB, -TEAM_PITCHING_H, -TEAM_PITCHING_SO, -TEAM_BATTING_HBP, -TEAM_BASERUN_CS, -TEAM_BATTING_1B)
Call:
lm(formula = TARGET_WINS ~ ., data = m4_data)
Residuals:
Min 1Q Median 3Q Max
-33.994 -6.794 0.131 6.903 31.095
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5.713e+04 3.049e+04 -1.873 0.061213 .
TEAM_BATTING_2B -5.845e-02 2.991e-02 -1.954 0.050836 .
TEAM_BATTING_HR 8.265e-02 7.497e-02 1.102 0.270442
TEAM_BATTING_SO -3.289e-02 9.775e-03 -3.365 0.000786 ***
TEAM_PITCHING_HR 2.690e-02 6.422e-02 0.419 0.675433
TEAM_FIELDING_DP -1.084e-01 1.343e-02 -8.069 1.46e-15 ***
TEAM_BASERUN_CS_tform 9.830e-01 1.067e+00 0.921 0.356943
TEAM_BASERUN_SB_tform 3.803e+00 8.515e-01 4.466 8.57e-06 ***
TEAM_BATTING_3B_tform 6.310e+00 1.456e+00 4.335 1.56e-05 ***
TEAM_BATTING_BB_tform 5.846e-05 2.713e-05 2.155 0.031329 *
TEAM_BATTING_H_tform 8.573e+04 7.161e+04 1.197 0.231415
TEAM_BATTING_1B_tform 1.446e+04 3.134e+04 0.462 0.644473
TEAM_FIELDING_E_tform -1.168e+03 9.249e+01 -12.630 < 2e-16 ***
TEAM_PITCHING_BB_tform 4.444e+00 8.335e+00 0.533 0.594005
TEAM_PITCHING_H_tform -4.167e+04 2.492e+04 -1.672 0.094777 .
TEAM_PITCHING_SO_tform 8.444e+00 7.733e+00 1.092 0.275075
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 9.828 on 1470 degrees of freedom
(790 observations deleted due to missingness)
Multiple R-squared: 0.4066, Adjusted R-squared: 0.4005
F-statistic: 67.14 on 15 and 1470 DF, p-value: < 2.2e-16
Using Stepwise backward elimination
m4a <- update(m4, . ~ . -TEAM_PITCHING_HR)
# summary(m4a)
m4b <- update(m4a, . ~ . -TEAM_BATTING_1B_tform)
# summary(m4b)
m4c <- update(m4b, . ~ . -TEAM_PITCHING_BB_tform)
# summary(m4c)
m4d <- update(m4c, . ~ . -TEAM_PITCHING_SO_tform)
# summary(m4d)
m4e <- update(m4d, . ~ . -TEAM_PITCHING_H_tform)
summary(m4e)
Call:
lm(formula = TARGET_WINS ~ TEAM_BATTING_2B + TEAM_BATTING_HR +
TEAM_BATTING_SO + TEAM_FIELDING_DP + TEAM_BASERUN_CS_tform +
TEAM_BASERUN_SB_tform + TEAM_BATTING_3B_tform + TEAM_BATTING_BB_tform +
TEAM_BATTING_H_tform + TEAM_FIELDING_E_tform, data = m4_data)
Residuals:
Min 1Q Median 3Q Max
-33.525 -7.082 0.271 6.968 31.845
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.994e+04 9.958e+03 -7.024 3.29e-12 ***
TEAM_BATTING_2B -7.585e-02 9.613e-03 -7.890 5.85e-15 ***
TEAM_BATTING_HR 9.978e-02 9.736e-03 10.249 < 2e-16 ***
TEAM_BATTING_SO -2.190e-02 2.503e-03 -8.750 < 2e-16 ***
TEAM_FIELDING_DP -1.084e-01 1.340e-02 -8.086 1.27e-15 ***
TEAM_BASERUN_CS_tform 7.508e-01 1.061e+00 0.707 0.479
TEAM_BASERUN_SB_tform 4.026e+00 8.372e-01 4.809 1.67e-06 ***
TEAM_BATTING_3B_tform 5.518e+00 9.491e-01 5.814 7.45e-09 ***
TEAM_BATTING_BB_tform 7.186e-05 6.130e-06 11.723 < 2e-16 ***
TEAM_BATTING_H_tform 7.145e+04 9.968e+03 7.168 1.20e-12 ***
TEAM_FIELDING_E_tform -1.190e+03 9.144e+01 -13.015 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 9.835 on 1475 degrees of freedom
(790 observations deleted due to missingness)
Multiple R-squared: 0.4037, Adjusted R-squared: 0.3997
F-statistic: 99.88 on 10 and 1475 DF, p-value: < 2.2e-16
Mael
data_train_mi %>%
gather(variable, value, -c(TARGET_WINS:TEAM_BATTING_1B)) %>%
ggplot(., aes(value, TARGET_WINS)) +
geom_point(fill = "pink", color="pink") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
facet_wrap(~variable, scales ="free", ncol = 4) +
labs(x = element_blank(), y = "Wins")mi_m1 <- lm(TARGET_WINS ~ TEAM_BATTING_AB + TEAM_BATTING_AVG + TEAM_BATTING_OBP + TEAM_BATTING_SLG, data = data_train_mi)
summary(mi_m1)
Call:
lm(formula = TARGET_WINS ~ TEAM_BATTING_AB + TEAM_BATTING_AVG +
TEAM_BATTING_OBP + TEAM_BATTING_SLG, data = data_train_mi)
Residuals:
Min 1Q Median 3Q Max
-22.7778 -6.4023 -0.3835 5.8471 24.0195
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.438e+02 3.023e+01 -8.066 8.70e-14 ***
TEAM_BATTING_AB 2.083e-02 6.001e-03 3.471 0.000644 ***
TEAM_BATTING_AVG -6.611e+02 1.111e+02 -5.950 1.31e-08 ***
TEAM_BATTING_OBP 8.939e+02 1.156e+02 7.733 6.42e-13 ***
TEAM_BATTING_SLG 1.049e+02 3.922e+01 2.674 0.008161 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 9.105 on 186 degrees of freedom
(2085 observations deleted due to missingness)
Multiple R-squared: 0.447, Adjusted R-squared: 0.4351
F-statistic: 37.59 on 4 and 186 DF, p-value: < 2.2e-16
The standardized residual plots show quite a few points outside the -2,2 range, which might justify removing those observations.
Residuals vs Fitted: while the line is not quite horizontal, the constant variance assumption seems met Normal Q-Q plot: normality assumption is met Root(Squared Residuals) vs Fitted Values: Residuals vs Leverage: a few points have standardized residuals outside the (-2,2) ranhe which might justify removing those observations.
MI: tyring out predictions on this model
# data_test_mi <- add_features(data_test)
# data_test_mi <- add_advanced_bb_features(data_test_mi)
# pred.w.plim <- predict(mi_m1, data_test_mi, interval = "prediction")
# pred.w.clim <- predict(mi_m1, data_test_mi, interval = "confidence")